This is a popular dataset from Kaggle with information that allows for predicting sale prices and practicing feature engineering, random forests, and gradient boosting. In this project, I'll be focusing on data exploration and visualization.
The full data description is available on the Kaggle page for the "House Prices: Advanced Regression Techniques" competition.
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
sns.set()
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
# Load the training split of the Kaggle house-prices dataset.
data_path = "../datasets/P5/train.csv" #using the training dataset
df = pd.read_csv(data_path)
# First look: column dtypes / non-null counts, then summary statistics.
df.info()
df.describe()
Notes
- Some numerical columns are actually categorical codes (e.g. MSSubClass); we should fix that early on.
- Several text columns should be converted from object to category.
- There are many null values; let's start by visualizing them for better understanding, then getting rid of the worst columns. We also need to populate the columns with fewer null values.
- Since the goal is predicting SalePrice using regression, let's see how each individual variable correlates to it.
#Get a list of all numerical and categorical features
def cat_num_split(df_, y_inc=False, target="SalePrice"):
    """Split the columns of ``df_`` into numerical and categorical lists.

    A column counts as categorical when its dtype is ``object`` or
    ``category``; every other dtype counts as numerical.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame whose columns are classified.
    y_inc : bool, optional
        When True, the ``target`` column is *excluded* from the result.
        (The name is misleading — True drops the target — but it is kept
        for backward compatibility with existing calls.)
    target : str, optional
        Name of the target column dropped when ``y_inc`` is True.
        Defaults to "SalePrice" (previously hard-coded).

    Returns
    -------
    tuple of (list of str, list of str)
        ``(numerical_columns, categorical_columns)`` in original column order.
    """
    cols = df_.drop(columns=[target]).columns if y_inc else df_.columns
    numerical_list = []
    categorical_list = []
    for col in cols:
        # str() also catches ordered CategoricalDtype, whose repr is 'category'.
        if str(df_[col].dtype) in ("object", "category"):
            categorical_list.append(col)
        else:
            numerical_list.append(col)
    return numerical_list, categorical_list
# Split columns on the raw frame (SalePrice stays in num_list).
num_list, cat_list = cat_num_split(df)
# Display the numerical columns, then the categorical ones.
df[num_list]
df[cat_list]
#Create a copy of the dataset before any modifications
df2 = df.copy()
#Map every class number to CS{number} format
# (MSSubClass is a categorical code stored as an integer, so it is
# re-encoded here as a string label to be treated as categorical.)
#df2['MSSubClass'] = df2['MSSubClass'].map('CS{}'.format)
df2['MSSubClass'] = df2['MSSubClass'].map(lambda x: 'CS{}'.format(x))
df2['MSSubClass']
#Recompile num/cat lists
num_list, cat_list = cat_num_split(df2)
The target variable is the SalePrice of a given house. In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
def label_null_mapping(df_):
    """Return tick labels for a missing-value heatmap.

    Produces an Index aligned with ``df_.columns`` where each entry is the
    column name if that column contains at least one null, and the empty
    string otherwise — so only columns with missing data get labelled.

    Parameters
    ----------
    df_ : pandas.DataFrame

    Returns
    -------
    pandas.Index of str
    """
    # isnull().any() is the direct, single-pass form of the original
    # isnull().sum().astype('bool').
    has_null = df_.isnull().any()
    return df_.columns.map(lambda col: col if has_null[col] else "")
label_nas = label_null_mapping(df2)
#Display missing values
# Heatmap of the null-mask: x tick labels are blank for columns without
# nulls, so only columns that actually contain missing data are labelled.
plt.figure(figsize=(12,8))
plt.suptitle('Missing Values in the Dataset')
sns.heatmap(df2.isnull(), yticklabels=False, xticklabels=label_nas ,cbar = False, cmap='viridis')
plt.show()
#Remove columns with too many missing values
# Null count per feature column; SalePrice is excluded from the check.
null_values_per_col = np.sum(df2.drop(["SalePrice"], axis=1).isnull(), axis=0)
max_na = int(df2.shape[0]/5.0) #allowing up to 1/5 (20%) of the data to be missing
cols_to_remove = []
for col in df2.drop(["SalePrice"],axis=1).columns.tolist():
    if null_values_per_col[col] > max_na:
        cols_to_remove.append(col)
        # Drop the offending column in place; iteration is over a snapshot
        # list of the columns, so mutating df2 here is safe.
        df2.drop(col, axis=1, inplace=True)
print("New shape of the training set is: (%d,%d)" %df2.shape)
print("The removed columns are: " + str(cols_to_remove))
#Display again
# Re-plot the missing-value heatmap after the high-null columns were removed.
label_nas = label_null_mapping(df2)
plt.figure(figsize=(12,8))
plt.suptitle('Missing Values in the Dataset')
sns.heatmap(df2.isnull(), yticklabels=False, xticklabels=label_nas ,cbar = False, cmap='viridis')
plt.show()
# Columns that still contain nulls (the non-empty tick labels).
col_interest = [x for x in label_nas if x != ""]
print(col_interest)
# Fill the remaining null values:
# column mean for numerical features, column mode for categorical ones.
# (The old comment said "median", but the code uses the mean — the comment
# is corrected here to match the actual behaviour.)
# numeric_only=True restricts mean() to numeric columns, which older pandas
# did implicitly and newer pandas requires explicitly.
df2.fillna(df2.mean(numeric_only=True), inplace = True)
for col in col_interest:
    # Assign back instead of chained fillna(..., inplace=True), which is
    # unreliable under pandas copy-on-write. Numeric columns in col_interest
    # are already filled above, so this only affects the categorical ones.
    df2[col] = df2[col].fillna(df2[col].mode()[0])
To better represent categorical data, especially when it is ordinal, we remap some columns to ordered categorical encodings.
#remap some columns to have ordered (ordinal) encodings
# Each rep_map lists a feature's category levels from lowest to highest.
rep_map_1 = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
rep_map_2 = ["NA", "No", "Mn", "Av", "Gd"]
rep_map_3 = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
# rep_map_4 is defined but not referenced in ordinal_var_dict below.
rep_map_4 = ["NA", "MnWw", "GdWo", "MnPrv", "GdPrv"]
rep_map_5 = ["N", "P", "Y"]
rep_map_6 = ["NA", "Unf", "RFn", "Fin"]
rep_map_7 = np.linspace(1, 10, 10).astype("int").astype("str").tolist()  # "1".."10"
rep_map_8 = ["Sev", "Mod", "Gtl"]
rep_map_9 = ["ELO", "NoSeWa", "NoSewr", "AllPub"]
rep_map_10 = ["IR3", "IR2", "IR1", "Reg"]
# Ordinal feature name -> ordered list of its levels.
ordinal_var_dict = {"ExterCond": rep_map_1,
                    "ExterQual": rep_map_1,
                    "BsmtCond": rep_map_1,
                    "BsmtQual": rep_map_1,
                    "HeatingQC": rep_map_1,
                    "KitchenQual": rep_map_1,
                    "GarageCond": rep_map_1,
                    "BsmtExposure": rep_map_2,
                    "BsmtFinType1": rep_map_3,
                    "BsmtFinType2": rep_map_3,
                    "PavedDrive": rep_map_5,
                    "GarageFinish": rep_map_6,
                    #"OverallCond": rep_map_7,
                    #"OverallQual": rep_map_7,
                    "LandSlope": rep_map_8,
                    "Utilities": rep_map_9,
                    "LotShape": rep_map_10,
                    }
# Derive the ordinal column list from the dict keys instead of maintaining
# a second hand-written list that must be kept in sync (the original
# duplicated all 15 names verbatim).
cols_ordinal = list(ordinal_var_dict)
num_list, cat_list = cat_num_split(df2)
# Reencode ordinal categories with an explicit, ordered level list so that
# comparisons and sort order respect the domain ordering.
for var in ordinal_var_dict:
    ordered_var = pd.api.types.CategoricalDtype(ordered = True,
                                                categories = ordinal_var_dict[var])
    df2[var] = df2[var].astype(ordered_var)
# Reencode the remaining (nominal) categorical columns.
# BUG FIX: this loop originally assigned to `df` (the untouched original
# frame) instead of `df2`, so the nominal columns of df2 were never actually
# converted to 'category'.
for col in cat_list:
    if col not in ordinal_var_dict:
        df2[col] = df2[col].astype('category')
df2.info()
num_list, cat_list = cat_num_split(df2)
# One pair of plots per numerical feature: boxplot (outlier view) on the
# left, histogram on the right; two features share each gridspec row.
f = plt.figure(figsize=(14, 48))
gs = f.add_gridspec(len(num_list), 4)
for i, col in enumerate(num_list):
    # Feature i goes to gridspec columns 0-1 (even i) or 2-3 (odd i) of row i//2.
    ix, iy= int(i/2), (2 if i%2 else 0)
    ax = f.add_subplot(gs[ix,iy])
    ax = sns.boxplot(df2[col]);
    ax = f.add_subplot(gs[ix,iy+1])
    # NOTE(review): distplot is deprecated in newer seaborn (histplot is the
    # replacement) — kept as-is for the version this notebook targets.
    ax = sns.distplot(df2[col], kde=False);
f.tight_layout()
# Count plots for the ordinal categorical features, 4 per gridspec row,
# each bar annotated with its share of the dataset.
cols_interest = [x for x in cat_list if x in cols_ordinal]
f = plt.figure(figsize=(20, 50))
gs = f.add_gridspec(len(cols_interest), 4)
for i, col in enumerate(cols_interest):
    ax = f.add_subplot(gs[int(i/4),i%4])
    #sns.boxplot(df2[col], kind="box");
    ax = sns.countplot(df2[col])
    ax.set_xticklabels(ax.get_xticklabels(), ha = 'right', rotation=60)
    #sns.distplot(df2[col], kde=False);
    # add annotations
    # NOTE(review): row count is taken from df rather than df2; no rows are
    # dropped between the two frames (only columns), so the counts should
    # match — confirm before reuse.
    n_points = df.shape[0]
    col_counts = df2[col].value_counts()
    locs, labels = plt.xticks() # get the current tick locations and labels
    # loop through each pair of locations and labels
    for loc, label in zip(locs, labels):
        # get the text property for the label to get the correct count
        count = col_counts[label.get_text()]
        pct_string = '{:0.1f}%'.format(100*count/n_points)
        # print the annotation just below the top of the bar
        plt.text(loc, count+10, pct_string, ha = 'left', color = 'b', rotation=60, fontsize=10)
f.tight_layout()
# Count plots for the nominal (non-ordinal) categorical features, 5 per
# gridspec row, each bar annotated with its share of the dataset.
cols_interest = [x for x in cat_list if not (x in cols_ordinal)]
f = plt.figure(figsize=(25, 100))
gs = f.add_gridspec(len(cols_interest), 5)
for i, col in enumerate(cols_interest):
    ax = f.add_subplot(gs[int(i/5),i%5])
    #sns.boxplot(df2[col], kind="box");
    ax = sns.countplot(df2[col])
    ax.set_xticklabels(ax.get_xticklabels(), ha = 'right', rotation=60)
    #sns.distplot(df2[col], kde=False);
    # add annotations
    # NOTE(review): row count taken from df rather than df2; only columns
    # (not rows) differ between the two frames, so this should be safe.
    n_points = df.shape[0]
    col_counts = df2[col].value_counts()
    locs, labels = plt.xticks() # get the current tick locations and labels
    # loop through each pair of locations and labels
    for loc, label in zip(locs, labels):
        # get the text property for the label to get the correct count
        count = col_counts[label.get_text()]
        pct_string = '{:0.1f}%'.format(100*count/n_points)
        # print the annotation just below the top of the bar
        plt.text(loc, count+10, pct_string, ha = 'left', color = 'b', rotation=60, fontsize=10)
f.tight_layout()
# These features showed little variance or little relation to SalePrice in
# the univariate plots, so we get rid of them in this step.
# (This explanatory text was originally markdown that got fused onto the
# code line during notebook export; it is restored here as a comment.)
num_to_drop = ['Id', 'LowQualFinSF', '3SsnPorch', 'PoolArea', 'BsmtFinSF2', 'BsmtHalfBath', 'KitchenAbvGr', 'EnclosedPorch', 'ScreenPorch', 'MiscVal']
cat_to_drop = ['Street', 'Utilities', 'LandSlope', 'Condition2', 'RoofMatl', 'Heating', 'GarageQual', 'GarageCond']
df2 = df2.drop(num_to_drop + cat_to_drop, axis=1)
MoSold is a periodic value, where 12 should be interpreted as closer to 1 than to 10. That's why it is better to apply a trigonometric transform to it if we try to fit a model to this data but that's beyond the scope of the project so I will skip it.
SalePrice and other numerical features are very skewed, will need a log transform to make them as close to a normal distribution as possible. I will start by visualising that aspect in SalePrice then proceed to fix it as well as the other features, and replot everything at the end.
#Check the skewness of SalePrice
skewness = df2['SalePrice'].skew()
f, ax = plt.subplots(figsize=(10, 5))
# distplot with kde=True overlays a kernel density estimate on the histogram.
sns.distplot(df2['SalePrice'], color="b", kde=True);
ax.set(ylabel="Frequency")
ax.set(xlabel="SalePrice")
ax.set(title="SalePrice distribution (skew = {})".format(round(skewness,2)))
plt.show()
#Apply log1p to reduce skewness
# log1p = log(1 + x), which stays defined at x = 0.
skewness = np.log1p(df2['SalePrice']).skew()
f, ax = plt.subplots(figsize=(10, 5))
sns.distplot(np.log1p(df2['SalePrice']), color="b");
ax.set(ylabel="Frequency")
# NOTE(review): the x axis actually shows log1p(SalePrice), not SalePrice.
ax.set(xlabel="SalePrice")
ax.set(title="SalePrice distribution (skew = {})".format(round(skewness,2)))
plt.show()
# Since log1p noticeably reduces the skew of SalePrice, we will apply the
# same transform to all skewed numerical data. First, compare the raw and
# log-transformed SalePrice distributions side by side.
# (The sentence above was markdown that got fused onto the code line during
# notebook export; restored here as a comment.)
skewness1 = df2['SalePrice'].skew()
skewness2 = np.log1p(df2['SalePrice']).skew()
f = plt.figure(figsize=(15, 5))
gs = f.add_gridspec(1, 2)
ax = f.add_subplot(gs[0,0])
sns.distplot(df2['SalePrice'], color="b", kde=True);
ax.set(title="(skew = {})".format(round(skewness1,2)))
#ax.set(ylabel="Frequency")
#ax.set(xlabel="SalePrice")
ax = f.add_subplot(gs[0,1])
ax = sns.distplot(np.log1p(df2['SalePrice']), color="b");
ax.set(title="(skew = {})".format(round(skewness2,2)))
f.suptitle('Sale Price distribution', fontsize=16)
f.tight_layout()
num_list, cat_list = cat_num_split(df2)
# Log transform of the skewed numerical features to lessen impact of outliers
skewness = df2[num_list].skew()
# Threshold 0.62 chosen so that YearBuilt is NOT transformed (its skew sits
# just below this value, per the original author's note).
skewness = skewness[abs(skewness) > 0.62] #Less than this and it will transform YearBuilt
print(str(skewness.shape[0]) + " skewed numerical features to log transform")
skewed_features = skewness.index
print(skewed_features)
# Vectorized log1p over all selected columns (SalePrice included if skewed).
df2[skewed_features] = np.log1p(df2[skewed_features])
# Re-plot box/histogram pairs for every numerical feature after the log
# transform; two features per gridspec row, boxplot left, histogram right.
f = plt.figure(figsize=(14, 48))
gs = f.add_gridspec(len(num_list), 4)
for i, col in enumerate(num_list):
    # Feature i goes to gridspec columns 0-1 (even i) or 2-3 (odd i) of row i//2.
    ix, iy= int(i/2), (2 if i%2 else 0)
    ax = f.add_subplot(gs[ix,iy])
    ax = sns.boxplot(df2[col]);
    ax = f.add_subplot(gs[ix,iy+1])
    ax = sns.distplot(df2[col], kde=False);
f.tight_layout()
Since SalePrice and several other variables were skewed, I needed to normalize them by applying np.log1p. MoSold could also be transformed to better show its periodic nature, but that's not needed for this project.
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
#Correlation matrix
# Absolute Pearson correlations between numerical columns.
# NOTE(review): newer pandas requires corr(numeric_only=True) when object/
# category columns are present — confirm the pandas version before rerunning.
corr_mat = df2.corr().abs()
# Find most important features relative to target
corr_mat.sort_values(["SalePrice"], ascending = False, inplace = True)
print(corr_mat.SalePrice)
# Top 12 features most correlated with SalePrice (index 0 is SalePrice itself).
cols_top_corr = corr_mat.SalePrice.index[1:13].tolist()
These are the features most strongly correlated with SalePrice.
#Correlation matrix between features
# Pairwise correlations between features only (target excluded).
corr_mat = df2.drop(columns=["SalePrice"]).corr().abs()
#Plot the heatmap with seaborn
plt.figure(figsize=(15,15))
sns.heatmap(corr_mat,
            xticklabels=corr_mat.columns,
            yticklabels=corr_mat.columns)
#List the highly correlated columns
# Flatten the matrix to (feature, feature) -> correlation and sort ascending.
corr_tmp = corr_mat.unstack()
corr_tmp = corr_tmp.sort_values(kind="quicksort", ascending = True)
# The last len(corr_mat) entries are the 1.0 self-correlations, so this
# slice prints the 20 most correlated distinct pairs just before them.
print(corr_tmp[-len(corr_mat)-20:-len(corr_mat)])
# Drop one feature from each highly correlated pair to reduce redundancy.
corr_to_drop = ["GarageYrBlt", "GarageCars", "GrLivArea", "TotalBsmtSF"]
cols_top_corr = [x for x in cols_top_corr if not (x in corr_to_drop)]
df2 = df2.drop(corr_to_drop, axis=1)
num_list, cat_list = cat_num_split(df2)
# Box plots of SalePrice against every categorical feature, 5 per row,
# to compare price distributions across category levels.
f = plt.figure(figsize=(25, 100))
gs = f.add_gridspec(len(cat_list), 5)
for i, col in enumerate(cat_list):
    ax = f.add_subplot(gs[int(i/5),i%5])
    #sns.boxplot(df2[col], kind="box");
    ax = sns.boxplot(x=col, y="SalePrice", data=df2)
    #sns.distplot(df2[col], kde=False);
    ax.set_xticklabels(ax.get_xticklabels(), ha = 'right', rotation=60)
f.tight_layout()
# Pairwise scatter plots with regression lines: all numerical features,
# then only the top SalePrice-correlated ones.
# NOTE(review): the `size` keyword was renamed to `height` in newer seaborn.
g = sns.pairplot(df2[num_list], size=4, kind="reg");
g = sns.pairplot(df2[cols_top_corr + ['SalePrice']], size=4, kind="reg");
Observations:
- Several categorical features clearly influence SalePrice, mostly quality-related ones, including Basement, Kitchen, Heating, and Drive Way.
- Lot shape relates to SalePrice, with the most irregularly shaped lots fetching the highest prices.
- 1stFlrSF and GarageArea show a very high correlation.
- Between YearBuilt and GarageArea: year over year, garage areas have been increasing. This is the opposite of my expectation, since I thought the increasing demand and price of land would push the average garage area down.
- There is a positive relationship between YearBuilt and the Overall Quality variable, not surprising considering the higher standards and more advanced technologies.
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
# Keep only the ordinal columns that survived the earlier drops.
cols_ordinal = [x for x in df2.columns.tolist() if x in cols_ordinal] # or x in cols_top_corr)]
# NOTE(review): cols_nominal and the cols_interest append below are not used
# anywhere later in this section — they look like leftovers.
cols_nominal = [x for x in df2.columns.tolist() if not (x in cols_ordinal)]
cols_interest.append('SalePrice')
# Features with few discrete values read better as stripplots (with jitter)
# than as scatterplots.
cols_stripplot = ['OverallQual', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces']
f = plt.figure(figsize=(100, 100))
# Grid: one row per top-correlated numerical feature, one column per
# ordinal feature (used as the hue in each panel).
gs = f.add_gridspec(len(cols_top_corr), len(cols_ordinal))
for i, col in enumerate(cols_top_corr):
    for j, col2 in enumerate(cols_ordinal):
        ax = f.add_subplot(gs[i,j])
        #sns.boxplot(df2[col], kind="box");
        #{x, y}_jitter does nothing with scatterplot according to docs
        if col in cols_stripplot:
            ax = sns.stripplot(x=col, y="SalePrice", hue=col2, data=df2, alpha = 0.3, jitter = 0.2)
        else:
            ax = sns.scatterplot(x=col, y="SalePrice", hue=col2, data=df2, alpha = 0.3)
        #sns.distplot(df2[col], kde=False);
        #ax.set_xticklabels(ax.get_xticklabels(), ha = 'right', rotation=60)
f.tight_layout()